library(ggplot2)
library(tidyverse)
library(lubridate)
setwd("/Volumes/Courses/ES218_A_SP/wokopa26/Vehicular accidents")
Accidents_Original <- readRDS("farsp.RDS")
alcohol <- read.csv("alcohol.csv")
alcohol <- alcohol %>%
mutate(description = if_else(description %in% c("Unknown", "Not reported"), NA, description))
body_typ <- read.csv("body_typ.csv")
body_typ <- body_typ %>%
mutate(description = if_else(description %in% c("Unknown body type", "Not Reported"), NA, description))
col_names <- read.csv("column_name_description.csv")
inj_sev <- read.csv("inj_sev.csv")
inj_sev <- inj_sev %>%
mutate(description = if_else(description %in% c("Unknown/Not Reported", "N/A"), NA, description))
man_coll <- read.csv("man_coll.csv")
man_coll <- man_coll %>%
mutate(description = if_else(description %in% c("Not Reported", "Unknown"), NA, description))
sex <- read.csv("sex.csv")
sex <- sex %>%
mutate(description = if_else(description %in% c("not reported", "unknown"), NA, description))
state_code <- read.csv("state_code.csv")
col_names <- setNames(col_names$column_name, col_names$description)
names(col_names) <- gsub(" ", "_", names(col_names))
Names_Changed <- Accidents_Original %>%
rename(alcohol = drinking) %>%
left_join(state_code, by = c("state" = "state_code")) %>%
left_join(sex, by = "sex") %>%
left_join(man_coll, by = "man_coll") %>%
left_join(inj_sev, by = "inj_sev") %>%
left_join(body_typ, by = "body_typ") %>%
left_join(alcohol, by = "alcohol") %>%
mutate(across(c(year, month, day, hour, minute, age), ~replace(., . == 99, NA)),
across(c(age), ~replace(., . == 999, NA))) %>%
drop_na(c("year", "month", "day", "hour", "minute")) %>%
mutate(
state = as.character(state_name),
sex = as.character(description.x),
man_coll = as.character(description.y),
alcohol = as.character(description),
inj_sev = as.character(description.y),
body_typ = as.character(description.y),
Datetime = ymd_hm(paste(year, month, day, hour, minute)),
Month_Name = month(month, label = TRUE)
) %>%
rename(!!!col_names) %>%
mutate(
State = as.factor(State),
County = as.factor(County),
Manner_of_collision = as.factor(Manner_of_collision),
Vehicle_body_type = as.factor(Vehicle_body_type),
Sex = as.factor(Sex),
Injury_severity = as.factor(Injury_severity),
Datetime = as.POSIXct(Datetime),
Date = as.Date(Datetime),
Accident_weekday = wday(Datetime, label = TRUE),
Month_Name = month(Datetime, label = TRUE), # create month name column
Year = year(Datetime)
) %>%
filter(Age < 100) %>%
select(-State, -description.x, -description.y, -description.x.x, -description.y.y,-description)
# 1. Number of accidents per state
ggplot(Names_Changed %>% filter(!is.na(state_name)), aes(x = state_name)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Number of Accidents per State")

# 2. Number of accidents per month
ggplot(Names_Changed %>% filter(!is.na(Datetime)), aes(x = month(Datetime, label = TRUE))) +
geom_bar() +
ggtitle("Number of Accidents per Month")

# 3. Time series plot
ggplot(Names_Changed, aes(x = Date, y = after_stat(count))) +
geom_line(stat = "count") +
labs(x = "Date", y = "Accidents", title = "Accident Rates Over Time")

# 4. Number of accidents per hour
ggplot(Names_Changed %>% filter(!is.na(Accident_hour)), aes(x = Accident_hour)) +
geom_bar() +
ggtitle("Number of Accidents per Hour")

# 5. Distribution of the age of persons involved in accidents
ggplot(Names_Changed %>% filter(!is.na(Age)), aes(x = Age)) +
geom_histogram(binwidth = 1) +
ggtitle("Distribution of Age of Persons Involved in Accidents")

# 6. Number of accidents by sex
ggplot(Names_Changed %>% filter(!is.na(Sex)), aes(x = Sex)) +
geom_bar() +
ggtitle("Number of Accidents by Sex")

# 7. Number of accidents with alcohol involvement
ggplot(Names_Changed %>% filter(!is.na(Alcohol_involvement)), aes(x = Alcohol_involvement)) +
geom_bar() +
ggtitle("Number of Accidents with Alcohol Involvement")

# 8. Distribution of injury severity - Duplicate?
ggplot(Names_Changed %>% filter(!is.na(Injury_severity)), aes(x = Injury_severity)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Distribution of Injury Severity")

# 9. Number of accidents by manner of collision - Duplicate?
ggplot(Names_Changed %>% filter(!is.na(Manner_of_collision)), aes(x = Manner_of_collision)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Number of Accidents by Manner of Collision")

# 10. Number of accidents by vehicle body type
ggplot(Names_Changed %>% filter(!is.na(Vehicle_body_type)), aes(x = Vehicle_body_type)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Number of Accidents by Vehicle Body Type")

# 11. Histogram of age
ggplot(Names_Changed, aes(x = Age)) +
geom_histogram(binwidth = 1) +
labs(x = "Age", y = "Count", title = "Age Distribution of Persons Involved in Accidents")

# 12. Box plot of age by injury severity
ggplot(Names_Changed, aes(x = Injury_severity, y = Age)) +
geom_boxplot() +
labs(x = "Injury Severity", y = "Age", title = "Age Distribution by Injury Severity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))

# 13. Bar plot of manner of collision
ggplot(Names_Changed, aes(x = Manner_of_collision)) +
geom_bar() +
labs(x = "Manner of Collision", y = "Count", title = "Manner of Collision Distribution") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))

# 14. Plot number of accidents by hour of the day
accidents_by_hour_month <- Names_Changed %>%
group_by(Accident_hour, Month_Name) %>%
summarize(Count = n()) %>%
ungroup()
## `summarise()` has grouped output by 'Accident_hour'. You can override using the
## `.groups` argument.
# 15. Plot the stacked bar plot of accidents by hour of the day, colored by month
ggplot(accidents_by_hour_month, aes(x = Accident_hour, y = Count, fill = factor(Month_Name))) +
geom_col(position = "stack") +
labs(title = "Number of Accidents by Hour of the Day and Month") +
scale_fill_discrete(name = "Hour of the Day") +
theme_minimal()

# 16. Scatter plot showing the relationship between age and the number of persons involved in accidents.
ggplot(Names_Changed, aes(x = Age, y = Number_of_persons_involved)) +
geom_point(alpha = 0.3) +
labs(x = "Age", y = "Number of persons involved")

# 17. Heatmap showing the density of accidents based on the day of the month and the time of day. - Does this show that something is at like 8pm that is odd. There are a few times that seem to be defaults.
ggplot(Names_Changed, aes(x = Accident_hour, y = Accident_day)) +
geom_bin2d(bins = 20) +
labs(x = "Hour of day", y = "Day of month", fill = "Number of accidents")

# 18. Bar plot showing the total number of accidents on each day of the week.
ggplot(Names_Changed, aes(x = Accident_weekday)) +
geom_bar() +
labs(x = "Day of week", y = "Number of accidents")

# 19. Bar plot showing the total number of accidents in each year.
ggplot(Names_Changed, aes(x = Accident_year)) +
geom_bar() +
labs(x = "Year", y = "Number of accidents")

# 20. Scatter plot showing the relationship between age and the time of day when accidents occur.
ggplot(Names_Changed, aes(x = Accident_hour, y = Age)) +
geom_point(alpha = 0.0005) +
labs(x = "Hour of day", y = "Age")

# 21. Stacked bar plot showing the number of accidents for each vehicle body type, colored by the injury severity.
ggplot(Names_Changed, aes(x = Vehicle_body_type, fill = Injury_severity)) +
geom_bar(position = "stack") +
labs(x = "Vehicle body type", y = "Number of accidents")

# 22. Box plot showing the distribution of age for each sex involved in accidents.
ggplot(Names_Changed, aes(x = Sex, y = Age)) +
geom_boxplot() +
labs(x = "Sex", y = "Age")

# 23. Stacked bar plot showing the number of accidents for each month, colored by the injury severity.
ggplot(Names_Changed, aes(x = Month_Name, fill = Injury_severity)) +
geom_bar() +
labs(x = "Month", y = "Number of accidents")+ theme(axis.text.x = element_text(angle = 90, hjust = 1))

# 24. Density plot showing the distribution of the number of persons involved in accidents.
ggplot(subset(Names_Changed, Number_of_persons_involved < 10), aes(x = Number_of_persons_involved)) +
geom_density() +
labs(x = "Number of persons involved", y = "Density")

# 25. Box plot showing the distribution of age for each manner of collision.
ggplot(Names_Changed, aes(x = Manner_of_collision, y = Age)) +
geom_boxplot()

# 26. Heatmap of accidents by hour and day of the week.
accidents_by_hour_weekday <- Names_Changed %>%
group_by(hour = hour(Datetime), weekday = wday(Datetime, label = TRUE)) %>%
summarize(n = n())
## `summarise()` has grouped output by 'hour'. You can override using the
## `.groups` argument.
ggplot(accidents_by_hour_weekday, aes(x = hour, y = weekday, fill = n)) +
geom_tile() +
scale_fill_gradient(low = "#f7f7f7", high = "#e31a1c") +
labs(x = "Hour of day", y = "Day of week", fill = "Number of accidents")

# 27. Faceted bar plot of accidents by vehicle body type and injury severity, colored by sex.
accidents_by_body_type_injury_sex <- Names_Changed %>%
filter(!is.na(Vehicle_body_type), !is.na(Injury_severity)) %>%
group_by(Vehicle_body_type, Injury_severity, Sex) %>%
summarize(n = n())
## `summarise()` has grouped output by 'Vehicle_body_type', 'Injury_severity'. You
## can override using the `.groups` argument.
ggplot(accidents_by_body_type_injury_sex, aes(x = Vehicle_body_type, y = n, fill = Injury_severity))+
geom_col(position = "dodge") +
facet_wrap(~ Sex, scales = "free_y") +
labs(x = "Vehicle body type", y = "Number of accidents", fill = "Injury severity")

# 28. Relationship between Hour of Day and Age of Persons Involved in Accidents, Colored by Injury Severity and Faceted by Sex.
ggplot(Names_Changed, aes(x = Accident_hour, y = Age, color = Injury_severity)) +
geom_point(alpha = 0.01) +
facet_wrap(~ Sex) +
labs(x = "Hour of day", y = "Age", color = "Injury severity")+
guides(color = guide_legend(override.aes = list(alpha = 1)))

accidents_per_day <- Names_Changed %>%
mutate(DoY = yday(Datetime), Month = month(Datetime), Day = day(Datetime), Hour = hour(Datetime)) %>%
group_by(Month, Day, Hour) %>%
summarize(count = n()) %>%
ungroup()
## `summarise()` has grouped output by 'Month', 'Day'. You can override using the
## `.groups` argument.
ggplot(accidents_per_day, aes(x = Day, y = Month, fill = count)) +
geom_tile(color = "white") +
scale_y_continuous(trans = 'reverse', breaks = 1:12, labels = month.abb) +
labs(x = "Day", y = "Month", fill = "Accidents", title = "Accidents per Day of the Year (Aggregating All Years)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

##
ggplot(accidents_per_day, aes(x = Day, y = Hour, fill = count - mean(count))) +
geom_tile(width = 0.5) +
scale_y_reverse() +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
facet_wrap(~ month(Month, label = TRUE)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 1)) +
labs(x = "Day", y = "Time", fill = "Accidents", title = "New Year\'s Eve and nighttime is dangerous!")

# Bivariate relationships - Scatter plot:
ggplot(data = Names_Changed, aes(x = Age, y = Number_of_persons_involved)) +
geom_point(alpha = 0.05) +
theme_minimal()

# Time-based analysis - Seasonal decomposition plot:
monthly_accidents <- Names_Changed %>%
mutate(Month = floor_date(Datetime, "month")) %>%
count(Month)
ggplot(monthly_accidents, aes(x = Month, y = n)) +
geom_line() +
theme_minimal() +
labs(x = "Year", y = "Number of Accidents")

Next split into states or reigions like New England where hours of
daylight will change and may cause more accidents
Maybe use MWStats for time series analysis